The goal of this project is to predict house prices. The approach which we used begins with Exploratory Data Analysis (EDA). We then used Data visualization and plots with ggplot to study the data. We cleaned the data and preprocessed it to create a model. We used SalePrice as our main predictor and then tried to find correlated variables to see what variables would be useful in predicting house prices. We narrowed it down to the best variables and then we used a linear regression model to predict the home prices with the variables that add value to home prices. To verify our results, we cross-validated our predicted values, which allowed us to see if the results were consistent among the different models employed.
This code will install the missing libraries and then load them.
packages <- c("ggplot2","dplyr","plyr","tidyverse","data.table","knitr","kableExtra","caret","corrplot","randomForest","Hmisc","moments","psych","glmnet","scales","ggrepel","cowplot")
for(pkg in packages){
if (pkg %in% rownames(installed.packages()) == FALSE)
{install.packages(pkg)}
if (pkg %in% rownames(.packages()) == FALSE)
{library(pkg, character.only = TRUE)}
}
We are using
read.csvto read data into train and test variables, setting character variables to be loaded as characters instead of factors.
train <- read.csv("ames_train.csv", stringsAsFactors = FALSE)
test <- read.csv("ames_test.csv", stringsAsFactors = FALSE)
dim(train)
## [1] 1460 81
dim(test)
## [1] 1459 80
glimpse(train)
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
## $ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
## $ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
## $ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
## $ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
## $ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
## $ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
## $ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
## $ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
## $ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
## $ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
## $ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
## $ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
## $ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
## $ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
Adding SalePrice column in the test dataset to make the variables equal in both train and test datasets.
test$SalePrice <- NA
complete_data_set <- rbind(train, test) #Binding train & test data
complete_data_set <- cbind(complete_data_set, Set = c(rep("Train", times = dim(train)[1]),
rep("Test", times = dim(test)[1])))
dim(complete_data_set)
## [1] 2919 82
Checking the summary of all the variables like mean, median and sd
describe(complete_data_set, fast=TRUE)
## vars n mean sd min max range se
## Id 1 2919 1460.00 842.79 1 2919 2918 15.60
## MSSubClass 2 2919 57.14 42.52 20 190 170 0.79
## MSZoning 3 2915 NaN NA Inf -Inf -Inf NA
## LotFrontage 4 2433 69.31 23.34 21 313 292 0.47
## LotArea 5 2919 10168.11 7887.00 1300 215245 213945 145.98
## Street 6 2919 NaN NA Inf -Inf -Inf NA
## Alley 7 198 NaN NA Inf -Inf -Inf NA
## LotShape 8 2919 NaN NA Inf -Inf -Inf NA
## LandContour 9 2919 NaN NA Inf -Inf -Inf NA
## Utilities 10 2917 NaN NA Inf -Inf -Inf NA
## LotConfig 11 2919 NaN NA Inf -Inf -Inf NA
## LandSlope 12 2919 NaN NA Inf -Inf -Inf NA
## Neighborhood 13 2919 NaN NA Inf -Inf -Inf NA
## Condition1 14 2919 NaN NA Inf -Inf -Inf NA
## Condition2 15 2919 NaN NA Inf -Inf -Inf NA
## BldgType 16 2919 NaN NA Inf -Inf -Inf NA
## HouseStyle 17 2919 NaN NA Inf -Inf -Inf NA
## OverallQual 18 2919 6.09 1.41 1 10 9 0.03
## OverallCond 19 2919 5.56 1.11 1 9 8 0.02
## YearBuilt 20 2919 1971.31 30.29 1872 2010 138 0.56
## YearRemodAdd 21 2919 1984.26 20.89 1950 2010 60 0.39
## RoofStyle 22 2919 NaN NA Inf -Inf -Inf NA
## RoofMatl 23 2919 NaN NA Inf -Inf -Inf NA
## Exterior1st 24 2918 NaN NA Inf -Inf -Inf NA
## Exterior2nd 25 2918 NaN NA Inf -Inf -Inf NA
## MasVnrType 26 2895 NaN NA Inf -Inf -Inf NA
## MasVnrArea 27 2896 102.20 179.33 0 1600 1600 3.33
## ExterQual 28 2919 NaN NA Inf -Inf -Inf NA
## ExterCond 29 2919 NaN NA Inf -Inf -Inf NA
## Foundation 30 2919 NaN NA Inf -Inf -Inf NA
## BsmtQual 31 2838 NaN NA Inf -Inf -Inf NA
## BsmtCond 32 2837 NaN NA Inf -Inf -Inf NA
## BsmtExposure 33 2837 NaN NA Inf -Inf -Inf NA
## BsmtFinType1 34 2840 NaN NA Inf -Inf -Inf NA
## BsmtFinSF1 35 2918 441.42 455.61 0 5644 5644 8.43
## BsmtFinType2 36 2839 NaN NA Inf -Inf -Inf NA
## BsmtFinSF2 37 2918 49.58 169.21 0 1526 1526 3.13
## BsmtUnfSF 38 2918 560.77 439.54 0 2336 2336 8.14
## TotalBsmtSF 39 2918 1051.78 440.77 0 6110 6110 8.16
## Heating 40 2919 NaN NA Inf -Inf -Inf NA
## HeatingQC 41 2919 NaN NA Inf -Inf -Inf NA
## CentralAir 42 2919 NaN NA Inf -Inf -Inf NA
## Electrical 43 2918 NaN NA Inf -Inf -Inf NA
## X1stFlrSF 44 2919 1159.58 392.36 334 5095 4761 7.26
## X2ndFlrSF 45 2919 336.48 428.70 0 2065 2065 7.93
## LowQualFinSF 46 2919 4.69 46.40 0 1064 1064 0.86
## GrLivArea 47 2919 1500.76 506.05 334 5642 5308 9.37
## BsmtFullBath 48 2917 0.43 0.52 0 3 3 0.01
## BsmtHalfBath 49 2917 0.06 0.25 0 2 2 0.00
## FullBath 50 2919 1.57 0.55 0 4 4 0.01
## HalfBath 51 2919 0.38 0.50 0 2 2 0.01
## BedroomAbvGr 52 2919 2.86 0.82 0 8 8 0.02
## KitchenAbvGr 53 2919 1.04 0.21 0 3 3 0.00
## KitchenQual 54 2918 NaN NA Inf -Inf -Inf NA
## TotRmsAbvGrd 55 2919 6.45 1.57 2 15 13 0.03
## Functional 56 2917 NaN NA Inf -Inf -Inf NA
## Fireplaces 57 2919 0.60 0.65 0 4 4 0.01
## FireplaceQu 58 1499 NaN NA Inf -Inf -Inf NA
## GarageType 59 2762 NaN NA Inf -Inf -Inf NA
## GarageYrBlt 60 2760 1978.11 25.57 1895 2207 312 0.49
## GarageFinish 61 2760 NaN NA Inf -Inf -Inf NA
## GarageCars 62 2918 1.77 0.76 0 5 5 0.01
## GarageArea 63 2918 472.87 215.39 0 1488 1488 3.99
## GarageQual 64 2760 NaN NA Inf -Inf -Inf NA
## GarageCond 65 2760 NaN NA Inf -Inf -Inf NA
## PavedDrive 66 2919 NaN NA Inf -Inf -Inf NA
## WoodDeckSF 67 2919 93.71 126.53 0 1424 1424 2.34
## OpenPorchSF 68 2919 47.49 67.58 0 742 742 1.25
## EnclosedPorch 69 2919 23.10 64.24 0 1012 1012 1.19
## X3SsnPorch 70 2919 2.60 25.19 0 508 508 0.47
## ScreenPorch 71 2919 16.06 56.18 0 576 576 1.04
## PoolArea 72 2919 2.25 35.66 0 800 800 0.66
## PoolQC 73 10 NaN NA Inf -Inf -Inf NA
## Fence 74 571 NaN NA Inf -Inf -Inf NA
## MiscFeature 75 105 NaN NA Inf -Inf -Inf NA
## MiscVal 76 2919 50.83 567.40 0 17000 17000 10.50
## MoSold 77 2919 6.21 2.71 1 12 11 0.05
## YrSold 78 2919 2007.79 1.31 2006 2010 4 0.02
## SaleType 79 2918 NaN NA Inf -Inf -Inf NA
## SaleCondition 80 2919 NaN NA Inf -Inf -Inf NA
## SalePrice 81 1460 180921.20 79442.50 34900 755000 720100 2079.11
## Set 82 2919 NaN NA Inf -Inf -Inf NA
#To check the class (character, numeric, integer) of the variables in the data set
table(sapply(complete_data_set, class))
##
## character integer
## 44 38
The dataset consists of 43 characters and 37 integer variables, and 35 columns with missing values (NA values).
# Building a dataset with response variable SalePrice and its log transformation
data <- complete_data_set %>%
select("SalePrice") %>%
na.omit %>%
mutate(LogSalePrice = log(SalePrice+1))
summary(data$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
cat("Skewness: ",skewness(data$SalePrice),"\n")
## Skewness: 1.880941
cat("Kurtosis: ",kurtosis(data$SalePrice))
## Kurtosis: 9.509812
A positive skew indicates that the tail is on the right side of the distribution, which extends towards more positive values and a kurtosis greater than 3, it is said to be
leptokurtic, which means it tends to produce more outliers than the normal distribution. To correct this we have used the log transformation of the response variable.
#Histogram plots of SalePrice and its log transformation
hist1 <- ggplot(data, aes(x = SalePrice, fill = ..count..), add=TRUE) +
geom_histogram(binwidth = 5000) +
ggtitle("Plot 1: Histogram plot of SalePrice") +
ylab("Frequency (Number of houses)") +
xlab("SalePrice") +
theme(plot.title = element_text(hjust = 0.5))
hist2 <- ggplot(data, aes(x = LogSalePrice, fill = ..count..)) +
geom_histogram(binwidth = 0.05) +
ggtitle("Plot 2: Histogram plot of log(SalePrice)") +
ylab("Frequency (Number of houses)") +
xlab("log(SalePrice)") +
theme(plot.title = element_text(hjust = 0.5))
cowplot::plot_grid(hist1, hist2)
Histogram plot to the left is positively skewed. Plot to the right represents the log transformation of the same response variable which follows normal distribution.
variables_with_na <- which(colSums(is.na(complete_data_set)) > 0)
sort(colSums(sapply(complete_data_set[variables_with_na], is.na)), decreasing = TRUE)
## PoolQC MiscFeature Alley Fence SalePrice FireplaceQu
## 2909 2814 2721 2348 1459 1420
## LotFrontage GarageYrBlt GarageFinish GarageQual GarageCond GarageType
## 486 159 159 159 159 157
## BsmtCond BsmtExposure BsmtQual BsmtFinType2 BsmtFinType1 MasVnrType
## 82 82 81 80 79 24
## MasVnrArea MSZoning Utilities BsmtFullBath BsmtHalfBath Functional
## 23 4 2 2 2 2
## Exterior1st Exterior2nd BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 1 1 1 1 1
## Electrical KitchenQual GarageCars GarageArea SaleType
## 1 1 1 1 1
We studied the NA values of all the variables above and found some discrepancies and some missing values that we corrected.
#Special Case: Basement specific variables, there is a mismatch in NA values in the below variables.
complete_data_set[ !is.na(complete_data_set$BsmtFinType1) & (
is.na(complete_data_set$BsmtCond) |
is.na(complete_data_set$BsmtQual) |
is.na(complete_data_set$BsmtExposure) |
is.na(complete_data_set$BsmtFinType2)),
c('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')
]
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
## 333 Gd TA No GLQ <NA>
## 949 Gd TA <NA> Unf Unf
## 1488 Gd TA <NA> Unf Unf
## 2041 Gd <NA> Mn GLQ Rec
## 2186 TA <NA> No BLQ Unf
## 2218 <NA> Fa No Unf Unf
## 2219 <NA> TA No Unf Unf
## 2349 Gd TA <NA> Unf Unf
## 2525 TA <NA> Av ALQ Unf
#Special Case: Garage specific variables, there is a mismatch in NA values, will check the entries
complete_data_set[ !is.na(complete_data_set$GarageType) & is.na(complete_data_set$GarageFinish),
c('GarageCars', 'GarageArea', 'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish')
]
## GarageCars GarageArea GarageType GarageCond GarageQual GarageFinish
## 2127 1 360 Detchd <NA> <NA> <NA>
## 2577 NA NA Detchd <NA> <NA> <NA>
clean_na_func <- function(data_set_name) {
#Basement special cases, missing values
data_set_name$BsmtFinType2[333] <- names(which.max(table(data_set_name$BsmtFinType2)))
data_set_name$BsmtExposure[c(949, 1488, 2349)] <- names(which.max(table(data_set_name$BsmtExposure)))
data_set_name$BsmtCond[c(2041, 2186, 2525)] <- names(which.max(table(data_set_name$BsmtCond)))
data_set_name$BsmtQual[c(2218, 2219)] <- names(which.max(table(data_set_name$BsmtQual)))
#Garage special case, missing value
data_set_name$GarageFinish[2127] <- names(which.max(table(data_set_name$GarageFinish)))
data_set_name$GarageCond[2127] <- names(which.max(table(data_set_name$GarageCond)))
data_set_name$GarageQual[2127] <- names(which.max(table(data_set_name$GarageQual)))
data_set_name$GarageCars[2577] <- 0
data_set_name$GarageArea[2577] <- 0
data_set_name$GarageType[2577] <- "None"
#correcting the error
data_set_name$GarageYrBlt[2593] <- 2007 #GarageYrBlt=2207 looks like a typo, YearBuilt=2006, YearRemodAdd=2007.
# For below variables where an absence of a feature is NA value, replacing it with 'None'
ColVector <- c("Alley", "BsmtQual", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "PoolQC", "Fence",
"MiscFeature", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "BsmtCond")
data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2,
function(x) {
replace(x, is.na(x), "None")
}
)
# Missing value is replaced by 0 in below Numeric Variables
ColVector <- c( "MasVnrArea", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath")
data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2,
function(x) {
replace(x, is.na(x), 0)
}
)
# Replacing with the maximum value (mode values) in below variables
ColVector <- c("MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "MasVnrType", "Electrical", "KitchenQual",
"Functional", "SaleType")
data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2,
function(x) {
replace(x, is.na(x), names(which.max(table(x))))
}
)
# Replacing the value with the median
ColVector <- c("LotFrontage","GarageCars", "GarageArea", "BsmtFinSF1")
data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2,
function(x) {
replace(x, is.na(x), median(x, na.rm = T))
}
)
# Assuming that the Garage was built the same year when the house was built to replace the NA values
data_set_name$GarageYrBlt[is.na(complete_data_set$GarageYrBlt)] <- data_set_name$YearBuilt[is.na(complete_data_set$GarageYrBlt)]
return(data_set_name)
}
complete_data_set <- clean_na_func(complete_data_set)
ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=factor(MSZoning), y=SalePrice)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
ggtitle("Plot 3: Box plot of SalePrice by MSZoning")
complete_data_set %>%
select(MSZoning, GrLivArea) %>%
group_by(MSZoning) %>%
summarise_at(vars(GrLivArea), list(count = length,avg_houseSize = mean))
## # A tibble: 5 × 3
## MSZoning count avg_houseSize
## <chr> <int> <dbl>
## 1 C (all) 25 1134.
## 2 FV 139 1582.
## 3 RH 26 1478.
## 4 RL 2269 1536.
## 5 RM 460 1322.
Average size of a house in commercial is much smaller than that in village zone, so there is no data discrepancy in this predictor variable and can be used in the model.
convert_to_int_func <- function(data_set_name) {
#There are various columns which has below scores, we would convert then to integer values based on the quality.
score <- c('None'=0, 'Po'=1, 'Fa'=2, 'TA'=3, 'Gd'=4, 'Ex'=5)
data_set_name$PoolQC <- as.integer(revalue(data_set_name$PoolQC, score))
data_set_name$FireplaceQu <- as.integer(revalue(data_set_name$FireplaceQu, score))
data_set_name$KitchenQual <- as.integer(revalue(data_set_name$KitchenQual, score))
data_set_name$ExterQual <- as.integer(revalue(data_set_name$ExterQual, score))
data_set_name$ExterCond <- as.integer(revalue(data_set_name$ExterCond, score))
data_set_name$BsmtQual <- as.integer(revalue(data_set_name$BsmtQual, score))
data_set_name$BsmtCond <- as.integer(revalue(data_set_name$BsmtCond, score))
data_set_name$GarageQual <- as.integer(revalue(data_set_name$GarageQual, score))
data_set_name$GarageCond <- as.integer(revalue(data_set_name$GarageCond, score))
data_set_name$BsmtExposure <- as.integer(revalue(data_set_name$BsmtExposure,
c('None'=0, 'No'=1, 'Mn'=2, 'Av'=3, 'Gd'=4)))
data_set_name$BsmtFinType1 <- as.integer(revalue(data_set_name$BsmtFinType1,
c('None'=0, 'Unf'=1, 'LwQ'=2, 'Rec'=3, 'BLQ'=4, 'ALQ'=5, 'GLQ'=6)))
data_set_name$BsmtFinType2 <- as.integer(revalue(data_set_name$BsmtFinType2,
c('None'=0, 'Unf'=1, 'LwQ'=2, 'Rec'=3, 'BLQ'=4, 'ALQ'=5, 'GLQ'=6)))
data_set_name$LotShape <- as.integer(revalue(data_set_name$LotShape,
c('IR3'=0, 'IR2'=1, 'IR1'=2, 'Reg'=3)))
data_set_name$GarageFinish<- as.integer(revalue(data_set_name$GarageFinish,
c('None'=0, 'Unf'=1, 'RFn'=2, 'Fin'=3)))
data_set_name$MasVnrType <- as.integer(revalue(data_set_name$MasVnrType,
c('None'=0, 'BrkCmn'=0, 'BrkFace'=1, 'Stone'=2)))
data_set_name$Functional <- as.integer(revalue(data_set_name$Functional,
c('Sal'=0, 'Sev'=1, 'Maj2'=2, 'Maj1'=3, 'Mod'=4, 'Min2'=5, 'Min1'=6, 'Typ'=7)))
return(data_set_name)
}
complete_data_set <- convert_to_int_func(complete_data_set)
table(sapply(complete_data_set, class))
##
## character integer numeric
## 28 43 11
numeric_train_data <- complete_data_set %>% filter(Set == "Train") %>%
select(which(sapply(.,is.integer)), which(sapply(., is.numeric)))
#, use = "pairwise.complete.obs"
correlation <- cor( numeric_train_data ) %>%
as.matrix( sort( correlation[,'SalePrice'], decreasing = TRUE )) #sort on decreasing correlations with SalePrice
corrplot.mixed(correlation, tl.col="black", tl.pos = "lt")
> From the correlation plot, we found a list of variables that are highly correlated with each other so we can remove one of them.
column_drop_list <- c('Utilities','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','YearRemodAdd','YrSold',
'EnclosedPorch','X3SsnPorch','ScreenPorch','WoodDeckSF','GarageArea','GarageQual','GarageCond',
'ExterQual','ExterCond','KitchenQual','FireplaceQu','PoolArea','PoolQC'
)
complete_data_set <- complete_data_set[,!(names(complete_data_set) %in% column_drop_list)]
##Converting character variables to factors
table(sapply(complete_data_set, class))
##
## character integer numeric
## 27 35 13
# List of character variables
variable_classes <- sapply(complete_data_set, class)
names(variable_classes[which(variable_classes=="character")])
## [1] "MSZoning" "Street" "Alley" "LandContour"
## [5] "LotConfig" "LandSlope" "Neighborhood" "Condition1"
## [9] "Condition2" "BldgType" "HouseStyle" "RoofStyle"
## [13] "RoofMatl" "Exterior1st" "Exterior2nd" "Foundation"
## [17] "Heating" "HeatingQC" "CentralAir" "Electrical"
## [21] "GarageType" "PavedDrive" "Fence" "MiscFeature"
## [25] "SaleType" "SaleCondition" "Set"
convert_to_factor_func <- function(data_set_name) {
#Numeric factors that should be categorical
data_set_name$MoSold <- factor(data_set_name$MoSold)
data_set_name$MSSubClass <- factor(data_set_name$MSSubClass)
data_set_name <- data_set_name %>%
mutate(Neighborhood = as.factor(Neighborhood),
Neighborhood2 = fct_collapse(Neighborhood,
"BluesteSawyer" = c("Blueste", "Sawyer"),
"NPkVillMitchel" = c("NPkVill", "Mitchel")))
data_set_name <- data_set_name %>%
mutate(LotShape = as.factor(LotShape),
LotShape2 = fct_collapse(LotShape,
"IR23" = c("IR2", "IR3")))
data_set_name <- data_set_name %>%
mutate(LotConfig = as.factor(LotConfig),
LotConfig2 = if_else(LotConfig == "CulDSac", 1, 0))
#Finding the Variables with character class
variable_classes <- sapply(data_set_name, class)
character_variables <- names(variable_classes[which(variable_classes=="character")])
# Converting them to factor
data_set_name[character_variables] <- lapply(data_set_name[character_variables], factor)
return(data_set_name)
}
complete_data_set <- convert_to_factor_func(complete_data_set)
#GrLivArea
ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=GrLivArea, y=SalePrice))+
geom_point() + geom_smooth(method = "lm", se=FALSE, aes(group=1)) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
geom_text_repel(aes(label = ifelse(complete_data_set$GrLivArea[!is.na(complete_data_set$SalePrice)]>4500, rownames(complete_data_set), '')))
#TotalBsmtSF
ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=TotalBsmtSF, y=SalePrice))+
geom_point() + geom_smooth(method = "lm", se=FALSE, aes(group=1)) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
geom_text_repel(aes(label = ifelse(complete_data_set$TotalBsmtSF[!is.na(complete_data_set$SalePrice)]>4500, rownames(complete_data_set), '')))
complete_data_set <- complete_data_set[ !(complete_data_set$Id %in% c(524, 1299)), ]
complete_data_set %>%
filter(OverallCond == 2 & SalePrice > 12)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 89 50 C (all) 105 8470 Pave None 2 Lvl
## 2 251 30 RL 55 5350 Pave None 2 Lvl
## 3 379 20 RL 88 11394 Pave None 3 Lvl
## 4 399 30 RM 60 8967 Pave None 3 Lvl
## 5 677 70 RM 60 9600 Pave Grvl 3 Lvl
## LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle
## 1 Corner Gtl IDOTRR Feedr Feedr 1Fam 1.5Fin
## 2 Inside Gtl BrkSide Norm Norm 1Fam 1Story
## 3 Corner Gtl StoneBr Norm Norm 1Fam 1Story
## 4 Corner Gtl IDOTRR Norm Norm 1Fam 1Story
## 5 Inside Gtl OldTown Norm Norm 1Fam 2Story
## OverallQual OverallCond YearBuilt RoofStyle RoofMatl Exterior1st Exterior2nd
## 1 3 2 1915 Hip CompShg Plywood Plywood
## 2 3 2 1940 Gable CompShg Wd Sdng Plywood
## 3 9 2 2010 Hip CompShg VinylSd VinylSd
## 4 5 2 1920 Gable CompShg MetalSd MetalSd
## 5 4 2 1900 Gable CompShg AsbShng Stucco
## MasVnrType MasVnrArea Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 1 0 0 CBlock 3 2 1 1
## 2 0 0 CBlock 3 3 1 1
## 3 2 350 PConc 5 3 3 6
## 4 0 0 BrkTil 2 1 1 1
## 5 0 0 BrkTil 3 2 1 1
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 1 0 1 0 1013 1013 GasA TA
## 2 0 1 0 728 728 GasA Ex
## 3 1445 1 0 411 1856 GasA Ex
## 4 0 1 0 961 961 GasA Gd
## 5 0 1 0 1095 1095 GasW Fa
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BedroomAbvGr
## 1 N SBrkr 1013 0 513 1526 2
## 2 Y SBrkr 1306 0 0 1306 3
## 3 Y SBrkr 1856 0 0 1856 1
## 4 Y Mix 1077 0 0 1077 2
## 5 N SBrkr 1095 679 0 1774 4
## KitchenAbvGr TotRmsAbvGrd Functional Fireplaces GarageType GarageYrBlt
## 1 1 6 7 0 None 1915
## 2 1 6 4 0 None 1940
## 3 1 8 7 1 Attchd 2010
## 4 1 6 2 0 Detchd 1920
## 5 2 8 5 0 2Types 1920
## GarageFinish GarageCars PavedDrive OpenPorchSF Fence MiscFeature MiscVal
## 1 0 0 N 0 MnPrv None 0
## 2 0 0 Y 0 GdWo Shed 450
## 3 3 3 Y 0 None None 0
## 4 1 1 N 0 None None 0
## 5 1 3 N 0 None None 0
## MoSold SaleType SaleCondition SalePrice Set TotalBaths RemodelAge
## 1 10 ConLD Abnorml 85000 Train 1.0 27
## 2 5 WD Normal 76500 Train 1.0 44
## 3 6 New Partial 394432 Train 2.5 0
## 4 11 WD Abnorml 67000 Train 1.0 57
## 5 5 WD Normal 87000 Train 3.0 56
## RemodelFlag TotalAreaSF TotalSF TotalPorchSF OverallScore GarageScore
## 1 1 2539 2539 156 6 0
## 2 1 2034 2034 263 6 0
## 3 0 3712 3712 113 18 9
## 4 1 2038 2038 0 10 1
## 5 1 2869 2869 90 8 4
## ExterScore KitchenScore FireplaceScore GarageAreaScore PoolScore
## 1 4 2 0 0 0
## 2 3 2 0 0 0
## 3 12 5 5 2502 0
## 4 6 3 0 338 0
## 5 9 6 0 1558 0
## Neighborhood2 LotShape2 LotConfig2
## 1 IDOTRR 2 0
## 2 BrkSide 2 0
## 3 StoneBr 3 0
## 4 IDOTRR 3 0
## 5 OldTown 3 0
complete_data_set <- complete_data_set %>%
mutate(OverallCond = if_else(Id == 379, 5L, OverallCond))
Id column is removed from both the datasets, Id is not a predictor.
train_cleaned <- complete_data_set %>%
filter(Set == "Train")
test_cleaned <- complete_data_set %>%
filter(Set == "Test")
#high correlation cor>0.3
for(col in colnames(train_cleaned)){
if(is.numeric(train_cleaned[,col])){
if(abs(cor(train_cleaned[,col],train_cleaned$SalePrice)) > 0.5 ){
print (c(col, abs(cor(train_cleaned[,col],train_cleaned$SalePrice))))
}
}
}
## [1] "OverallQual" "0.795774267424943"
## [1] "YearBuilt" "0.523608400701364"
## [1] "BsmtQual" "0.586674006281695"
## [1] "TotalBsmtSF" "0.651152911047542"
## [1] "X1stFlrSF" "0.631530379198999"
## [1] "GrLivArea" "0.734968164535933"
## [1] "TotRmsAbvGrd" "0.5377688162416"
## [1] "GarageYrBlt" "0.508718778483912"
## [1] "GarageFinish" "0.550254601749493"
## [1] "GarageCars" "0.641047007641277"
## [1] "SalePrice" "1"
## [1] "TotalBaths" "0.635896297561239"
## [1] "RemodelAge" "0.509705795839106"
## [1] "TotalAreaSF" "0.829041978106551"
## [1] "TotalSF" "0.829041978106551"
## [1] "OverallScore" "0.566758909826381"
## [1] "ExterScore" "0.578456395158882"
## [1] "FireplaceScore" "0.514058085218827"
## [1] "GarageAreaScore" "0.630015330054997"
# low correaltion cor<0.1
for(col in colnames(train_cleaned)){
if(is.numeric(train_cleaned[,col])){
if(abs(cor(train_cleaned[,col],train_cleaned$SalePrice))<0.1){
print(c(col,cor(train_cleaned[,col],train_cleaned$SalePrice)))
}
}
}
## [1] "Id" "-0.0216728932732659"
## [1] "OverallCond" "-0.0732347145215433"
## [1] "BsmtFinType2" "-0.00437071368987331"
## [1] "BsmtFinSF2" "-0.0114216653872752"
## [1] "LowQualFinSF" "-0.0256247637384778"
## [1] "MiscVal" "-0.0212031568730371"
#OverallQual
ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=factor(OverallQual), y=SalePrice)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)
#YearBuilt
ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=YearBuilt^15, y=SalePrice))+
geom_point() + geom_smooth(method = "lm", se=FALSE, aes(group=1)) +
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
geom_text_repel(aes(label = ifelse(complete_data_set$YearBuilt[!is.na(complete_data_set$SalePrice)]>4500, rownames(complete_data_set), '')))
ggplot(complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=Neighborhood, y=SalePrice)) +
geom_bar(stat='summary', fun.y = "median", fill='blue') +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(breaks= seq(0, 800000, by=50000), labels = comma) +
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3) +
geom_hline(yintercept=163000, linetype="dashed", color = "red") #dashed line is median SalePrice
hist.data.frame(complete_data_set %>%
select("GrLivArea","RemodelAge"))
hist.data.frame(complete_data_set %>%
select("GarageFinish","GarageCars","GarageAreaScore"))
hist.data.frame(complete_data_set %>%
select("OverallScore","ExterScore","FireplaceScore","KitchenScore"))
hist.data.frame(complete_data_set %>%
select("HeatingQC","CentralAir","FireplaceScore"))
hist.data.frame(complete_data_set %>%
select("BsmtQual","TotalBsmtSF","BsmtFinSF1","TotalPorchSF","KitchenAbvGr"))
hist.data.frame(complete_data_set %>%
select("X1stFlrSF","TotRmsAbvGrd","TotalBaths","TotalSF","OpenPorchSF"))
tc <- trainControl(method = "cv", number = 5)
set.seed(123)
(lm_model <- train(log(SalePrice+1) ~ (factor(OverallQual)+factor(BsmtQual)*TotalBsmtSF+YearBuilt^20+log(X1stFlrSF+1)*TotRmsAbvGrd
+factor(Neighborhood)+GarageFinish^3+GarageCars*TotalBaths+RemodelAge+
TotalSF^2+OverallScore+ExterScore*MSZoning+
factor(FireplaceScore)+Fireplaces+GarageAreaScore*log(GrLivArea+1)^14+
HeatingQC^4+CentralAir^2+TotalPorchSF+BsmtFinSF1*MasVnrArea^2
+KitchenScore*KitchenAbvGr^2+OverallQual+OpenPorchSF+TotalAreaSF),
data = select(train_cleaned, -Id),
preProcess = c("center","scale"),
trControl = tc,
method = "lm"))
## Linear Regression
##
## 1458 samples
## 28 predictor
##
## Pre-processing: centered (92), scaled (92)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1167, 1167, 1165, 1166, 1167
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1178501 0.9139376 0.0839691
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lm_model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72639 -0.05050 0.00508 0.05927 0.39042
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.0240152 0.0029271 4107.856 < 2e-16
## `factor(OverallQual)2` 0.0033719 0.0048661 0.693 0.488466
## `factor(OverallQual)3` 0.0152254 0.0103134 1.476 0.140099
## `factor(OverallQual)4` 0.0496164 0.0233832 2.122 0.034027
## `factor(OverallQual)5` 0.0732820 0.0389946 1.879 0.060418
## `factor(OverallQual)6` 0.0631568 0.0388689 1.625 0.104421
## `factor(OverallQual)7` 0.0629336 0.0374449 1.681 0.093050
## `factor(OverallQual)8` 0.0536447 0.0294149 1.824 0.068411
## `factor(OverallQual)9` 0.0391766 0.0162109 2.417 0.015793
## `factor(OverallQual)10` 0.0202401 0.0105148 1.925 0.054446
## `factor(BsmtQual)2` 0.0209218 0.0112978 1.852 0.064263
## `factor(BsmtQual)3` 0.0015115 0.0157188 0.096 0.923407
## `factor(BsmtQual)4` -0.0062475 0.0152052 -0.411 0.681227
## `factor(BsmtQual)5` -0.0110794 0.0145039 -0.764 0.445062
## TotalBsmtSF -0.0174799 0.0220039 -0.794 0.427101
## YearBuilt 0.0687105 0.0085293 8.056 1.70e-15
## `log(X1stFlrSF + 1)` 0.0523072 0.0189461 2.761 0.005842
## TotRmsAbvGrd 0.2687182 0.0963312 2.790 0.005352
## `factor(Neighborhood)Blueste` -0.0006722 0.0032337 -0.208 0.835370
## `factor(Neighborhood)BrDale` -0.0061193 0.0050081 -1.222 0.221964
## `factor(Neighborhood)BrkSide` 0.0151974 0.0074044 2.052 0.040312
## `factor(Neighborhood)ClearCr` 0.0143696 0.0051830 2.772 0.005639
## `factor(Neighborhood)CollgCr` 0.0176097 0.0091815 1.918 0.055323
## `factor(Neighborhood)Crawfor` 0.0272218 0.0065665 4.146 3.60e-05
## `factor(Neighborhood)Edwards` -0.0049775 0.0084198 -0.591 0.554510
## `factor(Neighborhood)Gilbert` 0.0109144 0.0071586 1.525 0.127575
## `factor(Neighborhood)IDOTRR` 0.0082739 0.0069885 1.184 0.236644
## `factor(Neighborhood)MeadowV` -0.0108154 0.0049596 -2.181 0.029375
## `factor(Neighborhood)Mitchel` 0.0009986 0.0061792 0.162 0.871639
## `factor(Neighborhood)NAmes` 0.0072094 0.0116658 0.618 0.536679
## `factor(Neighborhood)NoRidge` 0.0156518 0.0059703 2.622 0.008849
## `factor(Neighborhood)NPkVill` -0.0024076 0.0038433 -0.626 0.531137
## `factor(Neighborhood)NridgHt` 0.0193344 0.0073596 2.627 0.008707
## `factor(Neighborhood)NWAmes` -0.0016783 0.0071954 -0.233 0.815605
## `factor(Neighborhood)OldTown` 0.0038260 0.0103913 0.368 0.712787
## `factor(Neighborhood)Sawyer` 0.0015358 0.0073959 0.208 0.835526
## `factor(Neighborhood)SawyerW` 0.0043890 0.0064740 0.678 0.497921
## `factor(Neighborhood)Somerst` 0.0186482 0.0090395 2.063 0.039304
## `factor(Neighborhood)StoneBr` 0.0179401 0.0048405 3.706 0.000219
## `factor(Neighborhood)SWISU` 0.0010811 0.0053820 0.201 0.840824
## `factor(Neighborhood)Timber` 0.0087753 0.0054509 1.610 0.107653
## `factor(Neighborhood)Veenker` 0.0084372 0.0039164 2.154 0.031387
## GarageFinish 0.0114660 0.0044744 2.563 0.010496
## GarageCars 0.0356202 0.0114257 3.118 0.001862
## TotalBaths 0.0323409 0.0089618 3.609 0.000319
## RemodelAge -0.0102093 0.0048319 -2.113 0.034789
## TotalSF 0.1723086 0.0341987 5.038 5.32e-07
## OverallScore 0.0775849 0.0059706 12.995 < 2e-16
## ExterScore -0.0152106 0.0349726 -0.435 0.663682
## MSZoningFV 0.1104842 0.0435973 2.534 0.011381
## MSZoningRH 0.0322097 0.0211992 1.519 0.128897
## MSZoningRL 0.1612668 0.0576083 2.799 0.005192
## MSZoningRM 0.0922414 0.0511111 1.805 0.071338
## `factor(FireplaceScore)1` 0.0043641 0.0066630 0.655 0.512598
## `factor(FireplaceScore)2` 0.0068817 0.0076451 0.900 0.368201
## `factor(FireplaceScore)3` 0.0151633 0.0199188 0.761 0.446634
## `factor(FireplaceScore)4` 0.0271374 0.0220991 1.228 0.219662
## `factor(FireplaceScore)5` 0.0072808 0.0067049 1.086 0.277719
## `factor(FireplaceScore)6` 0.0185677 0.0197627 0.940 0.347621
## `factor(FireplaceScore)8` 0.0246071 0.0196528 1.252 0.210751
## `factor(FireplaceScore)9` 0.0015631 0.0050299 0.311 0.756020
## `factor(FireplaceScore)10` 0.0011001 0.0062727 0.175 0.860806
## `factor(FireplaceScore)12` 0.0018226 0.0050184 0.363 0.716525
## `factor(FireplaceScore)15` 0.0012955 0.0050244 0.258 0.796560
## Fireplaces -0.0132099 0.0328540 -0.402 0.687689
## GarageAreaScore 0.1390485 0.0793377 1.753 0.079892
## `log(GrLivArea + 1)` 0.0208955 0.0210834 0.991 0.321816
## HeatingQCFa -0.0060860 0.0034395 -1.769 0.077044
## HeatingQCGd -0.0046495 0.0035210 -1.320 0.186889
## HeatingQCPo -0.0037036 0.0030324 -1.221 0.222168
## HeatingQCTA -0.0104059 0.0041372 -2.515 0.012009
## CentralAirY 0.0102371 0.0038877 2.633 0.008553
## TotalPorchSF 0.0177087 0.0037011 4.785 1.90e-06
## BsmtFinSF1 0.0297297 0.0049065 6.059 1.77e-09
## MasVnrArea 0.0015946 0.0052087 0.306 0.759551
## KitchenScore 0.0026301 0.0112829 0.233 0.815712
## KitchenAbvGr -0.0365976 0.0143460 -2.551 0.010848
## OverallQual NA NA NA NA
## OpenPorchSF 0.0004292 0.0036249 0.118 0.905773
## TotalAreaSF NA NA NA NA
## `factor(BsmtQual)2:TotalBsmtSF` -0.0338750 0.0111822 -3.029 0.002497
## `factor(BsmtQual)3:TotalBsmtSF` -0.0440706 0.0180919 -2.436 0.014980
## `factor(BsmtQual)4:TotalBsmtSF` -0.0372368 0.0198128 -1.879 0.060397
## `factor(BsmtQual)5:TotalBsmtSF` NA NA NA NA
## `log(X1stFlrSF + 1):TotRmsAbvGrd` -0.2814197 0.1044987 -2.693 0.007167
## `GarageCars:TotalBaths` -0.0216757 0.0155648 -1.393 0.163965
## `ExterScore:MSZoningFV` -0.0161061 0.0520054 -0.310 0.756836
## `ExterScore:MSZoningRH` 0.0114287 0.0219606 0.520 0.602857
## `ExterScore:MSZoningRL` 0.0186702 0.0786204 0.237 0.812325
## `ExterScore:MSZoningRM` 0.0377002 0.0618111 0.610 0.542012
## `GarageAreaScore:log(GrLivArea + 1)` -0.1246132 0.0830349 -1.501 0.133655
## `BsmtFinSF1:MasVnrArea` 0.0012162 0.0056251 0.216 0.828854
## `KitchenScore:KitchenAbvGr` 0.0175014 0.0204629 0.855 0.392550
##
## (Intercept) ***
## `factor(OverallQual)2`
## `factor(OverallQual)3`
## `factor(OverallQual)4` *
## `factor(OverallQual)5` .
## `factor(OverallQual)6`
## `factor(OverallQual)7` .
## `factor(OverallQual)8` .
## `factor(OverallQual)9` *
## `factor(OverallQual)10` .
## `factor(BsmtQual)2` .
## `factor(BsmtQual)3`
## `factor(BsmtQual)4`
## `factor(BsmtQual)5`
## TotalBsmtSF
## YearBuilt ***
## `log(X1stFlrSF + 1)` **
## TotRmsAbvGrd **
## `factor(Neighborhood)Blueste`
## `factor(Neighborhood)BrDale`
## `factor(Neighborhood)BrkSide` *
## `factor(Neighborhood)ClearCr` **
## `factor(Neighborhood)CollgCr` .
## `factor(Neighborhood)Crawfor` ***
## `factor(Neighborhood)Edwards`
## `factor(Neighborhood)Gilbert`
## `factor(Neighborhood)IDOTRR`
## `factor(Neighborhood)MeadowV` *
## `factor(Neighborhood)Mitchel`
## `factor(Neighborhood)NAmes`
## `factor(Neighborhood)NoRidge` **
## `factor(Neighborhood)NPkVill`
## `factor(Neighborhood)NridgHt` **
## `factor(Neighborhood)NWAmes`
## `factor(Neighborhood)OldTown`
## `factor(Neighborhood)Sawyer`
## `factor(Neighborhood)SawyerW`
## `factor(Neighborhood)Somerst` *
## `factor(Neighborhood)StoneBr` ***
## `factor(Neighborhood)SWISU`
## `factor(Neighborhood)Timber`
## `factor(Neighborhood)Veenker` *
## GarageFinish *
## GarageCars **
## TotalBaths ***
## RemodelAge *
## TotalSF ***
## OverallScore ***
## ExterScore
## MSZoningFV *
## MSZoningRH
## MSZoningRL **
## MSZoningRM .
## `factor(FireplaceScore)1`
## `factor(FireplaceScore)2`
## `factor(FireplaceScore)3`
## `factor(FireplaceScore)4`
## `factor(FireplaceScore)5`
## `factor(FireplaceScore)6`
## `factor(FireplaceScore)8`
## `factor(FireplaceScore)9`
## `factor(FireplaceScore)10`
## `factor(FireplaceScore)12`
## `factor(FireplaceScore)15`
## Fireplaces
## GarageAreaScore .
## `log(GrLivArea + 1)`
## HeatingQCFa .
## HeatingQCGd
## HeatingQCPo
## HeatingQCTA *
## CentralAirY **
## TotalPorchSF ***
## BsmtFinSF1 ***
## MasVnrArea
## KitchenScore
## KitchenAbvGr *
## OverallQual
## OpenPorchSF
## TotalAreaSF
## `factor(BsmtQual)2:TotalBsmtSF` **
## `factor(BsmtQual)3:TotalBsmtSF` *
## `factor(BsmtQual)4:TotalBsmtSF` .
## `factor(BsmtQual)5:TotalBsmtSF`
## `log(X1stFlrSF + 1):TotRmsAbvGrd` **
## `GarageCars:TotalBaths`
## `ExterScore:MSZoningFV`
## `ExterScore:MSZoningRH`
## `ExterScore:MSZoningRL`
## `ExterScore:MSZoningRM`
## `GarageAreaScore:log(GrLivArea + 1)`
## `BsmtFinSF1:MasVnrArea`
## `KitchenScore:KitchenAbvGr`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1118 on 1368 degrees of freedom
## Multiple R-squared: 0.9266, Adjusted R-squared: 0.9218
## F-statistic: 194 on 89 and 1368 DF, p-value: < 2.2e-16
predictions <- data.frame(Id = test_cleaned$Id,
lm = exp(predict(lm_model, newdata= test_cleaned))
)
head(predictions)
## Id lm
## 1 1461 120070.5
## 2 1462 152337.9
## 3 1463 183249.0
## 4 1464 207566.9
## 5 1465 203255.8
## 6 1466 174096.9
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
rmse(train_cleaned$SalePrice, exp(predict(lm_model, newdata= train_cleaned)) )
## [1] 20390.52
r2 <- function(actual, predicted){
TSS <- sum((actual - mean(actual))^2)
RSS <- sum((actual - predicted)^2)
1 - RSS/TSS
}
r2(train_cleaned$SalePrice, exp(predict(lm_model, newdata= train_cleaned)))
## [1] 0.9341623
plot(exp(predict(lm_model, newdata= train_cleaned)), # Draw plot using Base R
train_cleaned$SalePrice,
xlab = "Predicted Values",
ylab = "Observed Values")
abline(a = 0, # Add straight line
b = 1,
col = "red",
lwd = 2)
#kaggle score of 0.13
submit <- test_cleaned %>%
select(Id) %>%
mutate(SalePrice = exp(predict(object = lm_model, newdata = test_cleaned)))
write.csv(submit, "kaggle_submit.csv", row.names = F )
head(submit)
## Id SalePrice
## 1 1461 120070.5
## 2 1462 152337.9
## 3 1463 183249.0
## 4 1464 207566.9
## 5 1465 203255.8
## 6 1466 174096.9